from google.colab import drive
drive.mount('/content/drive')
#upload all files from the lib folder here
from google.colab import files

def getLocalFiles():
    """Prompt for file uploads in Colab and write each uploaded file into the working directory.

    files.upload() returns a dict of {filename: raw bytes}; each entry is
    written out as-is so the helper modules below can be imported.
    """
    # FIX: the exported body was flush-left (a SyntaxError) and the file
    # handle was never closed; a context manager handles both.
    _files = files.upload()
    if len(_files) > 0:
        for k, v in _files.items():
            with open(k, 'wb') as out:
                out.write(v)

getLocalFiles()
!pip install nltk
import nltk
nltk.download('all')
!pip install catboost
!pip install xgboost
!pip install jsonify
!pip install requests
!pip install flask-ngrok
!pip install werkzeug
!pip install simplejson
!pip install Flask-Session
!!pip install ChatterBot
#sys.path.append('C:/Applications/Machine Learning/NLP/CapstoneProjectNLP/lib')
import sys
# Make the uploaded project helper modules importable.
sys.path.append('../')
nltk.download('stopwords')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import nltk
# Project-local helper modules (uploaded via getLocalFiles above).
import data_augmentation
import preprocess_data
import train_ml_model
import train_DL_model
import encode_data
import vectorizer
import time
import feature_extraction
from data_augmentation import *
from preprocess_data import *
from train_ml_model import *
from train_DL_model import *
from encode_data import *
from vectorizer import *
from feature_extraction import *
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
# FIX: sklearn.externals.joblib was removed in scikit-learn 0.23+;
# joblib must be imported directly.
import joblib
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, precision_score, roc_auc_score
from nltk import word_tokenize, pos_tag, pos_tag_sents
from sklearn import metrics
from sklearn.metrics import mean_squared_error,log_loss
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from keras.initializers import Constant
from keras.layers import Embedding,LSTM,Dense,Dropout,Bidirectional,Input,GlobalMaxPool1D,SpatialDropout1D
# NOTE(review): keras.preprocessing.sequence/text and keras.utils.np_utils
# were removed in newer Keras releases; these imports pin the notebook to an
# older Keras/TF version — confirm the runtime environment.
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras.models import Model,load_model
import keras.optimizers
from keras.optimizers import SGD
from keras.models import Sequential
from keras.layers import Flatten
from keras.layers import Dense
from keras.initializers import Constant
from keras.callbacks import ReduceLROnPlateau,EarlyStopping
from keras.layers import Embedding,LSTM,Dense,Dropout,Bidirectional
import keras.optimizers
import tensorflow as tf
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier,BaggingClassifier,GradientBoostingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import multilabel_confusion_matrix,classification_report,confusion_matrix,accuracy_score,f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
# accident_safety_data=pd.read_csv("C:/Applications/Machine Learning/NLP/CapstoneProjectNLP/data/hse_data.csv")
# Load the HSE accident dataset from the Colab-local path.
accident_safety_data=pd.read_csv("/content/hse_data.csv")
# Preview the first rows and the raw column names.
accident_safety_data.head()
accident_safety_data.columns
We can see that the column "Unnamed" is unwanted, as it will not help us in our analysis. Also, the column "Data" should be renamed to "Date". Therefore, let's drop the column "Unnamed" and rename the column "Data" to "Date".
#create a backup of the dataset before we make any changes to it
accident_safety_data_new=accident_safety_data.copy(True)
accident_safety_data_new.head()
#dropping "Unnamed" column
accident_safety_data_new.drop('Unnamed: 0',axis='columns', inplace=True)
#renaming "Data" column to "Date"
accident_safety_data_new.rename(columns = {'Data':'Date'}, inplace = True)
#renaming "Genre" column to "Gender"
accident_safety_data_new.rename(columns = {'Genre':'Gender'}, inplace = True)
#renaming "Employee or Third party" column to "Employee Type"
accident_safety_data_new.rename(columns = {'Employee or Third Party':'Employee Type'}, inplace = True)
# Count exact duplicate rows before inspecting/removing them.
accident_safety_data_new.duplicated().sum()
#Let us view the duplicate records
duplicates = accident_safety_data_new.duplicated()
accident_safety_data_new[duplicates]
# Drop the duplicates in place.
accident_safety_data_new.drop_duplicates(inplace=True)
#Let us check the shape of our dataset
accident_safety_data_new.shape
We can see that the dataset has 425 rows and 10 columns
accident_safety_data_new.head()
#Let us check for missing values in the dataset
# FIX: pd.value_counts is deprecated (removed in pandas 3.0); apply
# pd.Series.value_counts per column instead — identical output.
accident_safety_data_new.isna().apply(pd.Series.value_counts)
We can see that this dataset has no null values.
#Let us now check the datatype of the dataset and also get to know some more details
accident_safety_data_new.dtypes
Here, we can see that all the columns of the dataset are of "object" datatype. Coming to the type of data present in each column, we can see that there is a column "Date", which means it holds time series data. All other columns except "Description" are of categorical datatype.
# Transposed summary (count / unique / top / freq) for the object columns.
accident_safety_data_new.describe().T
From the above table, we can infer the below:
This dataset contains accident data of 3 countries, out of which Country1 has the most number of accidents.
The data is collected from 3 types of industry sectors.Local_3 has the most number of accidents.
There are 5 major accident levels in which this dataset has been classified.309 accidents are of accident level 1, making it the most frequent accident type. This also means that the data is not distributed evenly.
The data is a consolidation of accidents faced by employees as well as third party vendors and others. Third party employees have faced the most number of accidents according to this dataset.
396 male employees have been reported to have accidents, which mean the distribution of data in this case is also not evenly balanced.
33 different types of critical risks have been identified in the dataset.
We have seen that there are quite a few categorical columns in the dataset which can be encoded to numerical values e.g.
Local
Accident Level
Potential Accident Level
UNIVARIATE ANALYSIS
Let us check the distribution of data based on accident levels
# Histogram of record counts per Accident Level.
fig = px.histogram(accident_safety_data_new, x="Accident Level",title='Distribution by Accident Level')
fig.show()
We can see that the distribution of Accident Levels is highly imbalanced in the dataset
# Histogram of record counts per country.
fig = px.histogram(accident_safety_data_new, x="Countries" ,title='Distribution by countries')
fig.show()
We can see that "Country_01" has the most number of accident cases.
Let us now see the distribution of accidents with respect to the type of employee.(Employee/ThirdParty/ThirdPartyRemote)
# Histogram of record counts per employee type.
fig = px.histogram(accident_safety_data_new, x="Employee Type",title='Distribution by Employee type')
fig.show()
From the graph it is very clear that accidents have happened in almost equal proportions among permanent employees and third party contractors, with third party contractors a bit on the higher side.
Let us also check the distribution of accidents as per industry sector.
# Histogram of record counts per industry sector.
fig = px.histogram(accident_safety_data_new, x="Industry Sector",title='Distribution by Industry sector')
fig.show()
We can see that majority of accidents have happened in the mining sector, followed by metal industry and other type of industries.
We will now see the distribution of accidents as per Gender
# Histogram of record counts per gender.
fig = px.histogram(accident_safety_data_new, x="Gender",title='Distribution by Gender')
fig.show()
Clearly, the distribution of accidents is imbalanced when checked by "Gender". The count of accidents in males is way higher than that in females.
Lastly, let us check the distribution by Locals.
# Histograms for Local, Potential Accident Level and Critical Risk.
fig = px.histogram(accident_safety_data_new, x="Local",title='Distribution by local cities')
fig.show()
fig = px.histogram(accident_safety_data_new, x="Potential Accident Level",title='Distribution by potential accident level')
fig.show()
We can see that most of the people have met with accident having level 1.
fig = px.histogram(accident_safety_data_new, x="Critical Risk",title='Distribution by critical risk')
fig.show()
We can see from the graph that the Critical risk category "Others" have the most number of accidents. This means we are not clear about the exact risk factor associated with accidents in this dataset.
BIVARIATE ANALYSIS
Let us write a function to see how does the accident level varies with the Industry Sector and Countries
def plothistograms(data,column_name_x,column_name_y,value,title):
    """Show a plotly histogram of `column_name_x`, coloured by `column_name_y`.

    value : plotly barmode — 'relative', 'group' or 'stack'.
    title : chart title.
    """
    # FIX: the exported body was flush-left, which is a SyntaxError in Python;
    # indentation restored, behavior unchanged.
    fig = px.histogram(data,
                       x=column_name_x,
                       color=column_name_y,
                       barmode=value,
                       title=title
                       )
    fig.show()
# Accident levels broken down by country, then by industry sector.
plothistograms(accident_safety_data_new,"Countries","Accident Level","relative",'Distribution of various accident levels per country.')
Observations from the above graph:
plothistograms(accident_safety_data_new,"Industry Sector","Accident Level","group",'Distribution of various accident levels per industry sector.')
The most number of accidents have occurred in the Mining Industry in Country 1 so far, followed by the metal industry, also in Country 1.
plothistograms(accident_safety_data_new,"Local","Industry Sector","stack",'Distribution of various industry sectors per local city.')
plothistograms(accident_safety_data_new,"Countries","Industry Sector","stack","Distribution of various accident levels per country.")
plothistograms(accident_safety_data_new,"Industry Sector","Accident Level","stack",'Distribution of various accident levels per industry sector.')
accident_safety_data_new['Critical Risk'].value_counts()
# Label-encode Accident Level and shift to 1..5 so it can drive marker size.
df = accident_safety_data_new.copy(True)
lb_make = LabelEncoder()
df['Accident_Level_labelencoded'] = lb_make.fit_transform(df['Accident Level'])
df['Accident_Level_labelencoded']=df['Accident_Level_labelencoded']+1
fig = px.scatter(df, x="Critical Risk", y="Accident Level", color="Industry Sector",
size='Accident_Level_labelencoded', hover_data=['Industry Sector']
,title="Spread of Accident Levels by Critical risk category per Industry sector")
fig.show()
From the above graph we can see the following:
# Same scatter, but coloured by employee type instead of industry sector.
df = accident_safety_data_new.copy(True)
lb_make = LabelEncoder()
df['Accident_Level_labelencoded'] = lb_make.fit_transform(df['Accident Level'])
df['Accident_Level_labelencoded']=df['Accident_Level_labelencoded']+1
fig = px.scatter(df, x="Critical Risk", y="Accident Level", color="Employee Type",
size='Accident_Level_labelencoded', hover_data=['Critical Risk']
,title="Spread of Accident Levels by Critical Risk category per Employee Type"
,width=1200,height=600)
fig.show()
From the above graph we can make the below observations:
Mostly third party contractors(both on site and remote) have had accidents of notably all Accident Levels in the "Others" risk category.
"Pressed" risks are the second most dangerous ones where employees and contractors both have had accidents.
plothistograms(accident_safety_data_new,"Potential Accident Level","Industry Sector","stack",'Distribution of Potential Accident Level by Industry Sector')
# NOTE(review): the five word-cloud cells below differ only in the accident
# level ('I'..'V'); they could be factored into a single helper function.
from wordcloud import WordCloud
# Concatenate every description of the level into one text blob for the cloud.
keywords = " ".join(line for line in accident_safety_data_new[accident_safety_data_new['Accident Level']=='I'].Description)
word_cloud= WordCloud(width=1250, height=625, max_font_size=350,
random_state=42).generate(keywords)
plt.figure(figsize=(20, 10))
plt.title("Most frequent words used to describe Accident Level I", size=20, weight="bold")
plt.imshow(word_cloud)
plt.axis("off")
plt.show()
from wordcloud import WordCloud
keywords = " ".join(line for line in accident_safety_data_new[accident_safety_data_new['Accident Level']=='II'].Description)
word_cloud= WordCloud(width=1250, height=625, max_font_size=350,
random_state=42).generate(keywords)
plt.figure(figsize=(20, 10))
plt.title("Most frequent words used to describe Accident Level II", size=20, weight="bold")
plt.imshow(word_cloud)
plt.axis("off")
plt.show()
from wordcloud import WordCloud
keywords = " ".join(line for line in accident_safety_data_new[accident_safety_data_new['Accident Level']=='III'].Description)
word_cloud= WordCloud(width=1250, height=625, max_font_size=350,
random_state=42).generate(keywords)
plt.figure(figsize=(20, 10))
plt.title("Most frequent words used to describe Accident Level III", size=20, weight="bold")
plt.imshow(word_cloud)
plt.axis("off")
plt.show()
from wordcloud import WordCloud
keywords = " ".join(line for line in accident_safety_data_new[accident_safety_data_new['Accident Level']=='IV'].Description)
word_cloud= WordCloud(width=1250, height=625, max_font_size=350,
random_state=42).generate(keywords)
plt.figure(figsize=(20, 10))
plt.title("Most frequent words used to describe Accident Level IV", size=20, weight="bold")
plt.imshow(word_cloud)
plt.axis("off")
plt.show()
Accident Level V
from wordcloud import WordCloud
keywords = " ".join(line for line in accident_safety_data_new[accident_safety_data_new['Accident Level']=='V'].Description)
word_cloud= WordCloud(width=1250, height=625, max_font_size=350,
random_state=42).generate(keywords)
plt.figure(figsize=(20, 10))
plt.title("Most frequent words used to describe Accident Level V", size=20, weight="bold")
plt.imshow(word_cloud)
plt.axis("off")
plt.show()
DATA AUGMENTATION
#Let us first create a dataset using only the class variable "Accident Level" and Description column.
accident_safety_data_trimmed=accident_safety_data_new.copy(True)
accident_safety_data_trimmed['Accident_Level']=accident_safety_data_trimmed.apply(lambda col: str(col['Accident Level']), axis=1)
accident_safety_data_trimmed['Description']=accident_safety_data_trimmed.apply(lambda col: str(col['Description']), axis=1)
accident_safety_data_trimmed=accident_safety_data_trimmed[['Accident_Level','Description']]
accident_safety_data_trimmed.head()
labels, frequencies = np.unique(accident_safety_data_trimmed.Accident_Level.values, return_counts=True)
fig = px.pie(accident_safety_data_trimmed, values=frequencies, names=labels, title='Frequency of Description by Accident Level')
fig.show()
We can clearly see that the Description column is imbalanced in the dataset. Most of the description is present only for Accident Level I(0).
We will now check the exact counts of Descriptions per Accident level.
accident_safety_data_trimmed.Accident_Level.value_counts().values
for u in accident_safety_data_trimmed.Accident_Level.unique().tolist():
print(u)
Let us first divide our data into train and test samples
We will try augmentation techniques so that the data is balanced properly before it is passed into the dataset.
EDA
Using EDA let us perform data augmentation
Let us divide data of each Accident Level in different dataframes
# Split the trimmed data into one dataframe per Accident Level (I..V),
# so each class can be augmented with its own num_aug factor.
options=['I']
df_0=accident_safety_data_trimmed.copy(True)
df_0=df_0.loc[df_0['Accident_Level'].isin(options)]
df_0.shape
options=['II']
df_1=accident_safety_data_trimmed.copy(True)
df_1=df_1.loc[df_1['Accident_Level'].isin(options)]
df_1.shape
options=['III']
df_2=accident_safety_data_trimmed.copy(True)
df_2=df_2.loc[df_2['Accident_Level'].isin(options)]
df_2.shape
options=['IV']
df_3=accident_safety_data_trimmed.copy(True)
df_3=df_3.loc[df_3['Accident_Level'].isin(options)]
df_3.shape
options=['V']
df_4=accident_safety_data_trimmed.copy(True)
df_4=df_4.loc[df_4['Accident_Level'].isin(options)]
df_4.shape
Now, we will augment each dataset separately. Here the gen_eda function from data_augmentation.py takes in the below parameters:
dataset - dataframe name alpha_sr - percentage of words in the dataset we want to replace with synonyms.
alpha_ri - percentage of words in the dataset we want to randomly insert.
alpha_rs - percentage of words in the dataset we want to randomly swap.
alpha_rd - percentage of words in the dataset we want to randomly delete.
num_aug - total number of augmented sentences we want per sentence in the dataset.
# EDA augmentation per class: gen_eda(df, alpha_sr, alpha_ri, alpha_rs, alpha_rd, num_aug).
# num_aug is scaled per class so that the minority levels are upsampled the most.
df_0_up=gen_eda(df_0,0.7,0.1,0.2,0.15,2)
df_1_up=gen_eda(df_1,0.7,0.2,0.2,0.1,22)
df_2_up=gen_eda(df_2,0.7,0.2,0.2,0.1,29)
df_3_up=gen_eda(df_3,0.7,0.2,0.2,0.1,30)
df_4_up=gen_eda(df_4,0.7,0.2,0.2,0.1,114)
# Recombine the augmented per-class frames into one (roughly balanced) dataset.
accident_safety_data_upsampled = pd.concat([df_0_up,df_1_up,df_2_up,df_3_up,df_4_up])
accident_safety_data_upsampled.describe().T
accident_safety_data_upsampled.shape
labels, frequencies = np.unique(accident_safety_data_upsampled.Accident_Level.values, return_counts=True)
fig = px.pie(accident_safety_data_upsampled, values=frequencies, names=labels, title='Frequency of Description by Accident Level')
fig.show()
# Clean the text twice: one variant for deep-learning models, one for classic ML.
# (Redundant `lambda x: f(x)` wrappers replaced with the functions themselves.)
accident_safety_data_upsampled["Description_DL"] = accident_safety_data_upsampled["Description"].apply(clean_DL_data1)
accident_safety_data_upsampled["Description_ML"] = accident_safety_data_upsampled["Description"].apply(clean_data)
accident_safety_data_upsampled.head(10)
print(accident_safety_data_upsampled.Description[1])
print(accident_safety_data_upsampled.Description_DL[1])
print(accident_safety_data_upsampled.Description_ML[1])
Named entity recognition
# Persist the upsampled data, then reload it for the POS-tagging step.
accident_safety_data_upsampled.to_csv('upsampled_accident_safety_data.csv',index=False,encoding='utf-8')
accident_safety_upsampled_pos=pd.read_csv('upsampled_accident_safety_data.csv')
# POS-tag every tokenised description in one batch call.
accident_safety_upsampled_pos['POSTags'] = pos_tag_sents(accident_safety_upsampled_pos['Description_DL'].apply(word_tokenize).tolist())
accident_safety_upsampled_pos.POSTags[:1]
Feature extraction. We will try the below vectorizers
We will first work with data cleaned for machine learning and then data cleaned for deep learning.
Using Count Vectorizer
#unigrams
# Count-vectorize the three text variants (ML-cleaned, DL-cleaned, raw).
count_train_cv_ML_1,features_cv_ML_1 = count_vectorizer_features(accident_safety_upsampled_pos.Description_ML,1)
count_train_cv_DL_1,features_cv_DL_1 = count_vectorizer_features(accident_safety_upsampled_pos.Description_DL,1)
count_train_cv_1,features_cv_1 = count_vectorizer_features(accident_safety_upsampled_pos.Description,1)
x_orig_cv_1=pd.DataFrame(count_train_cv_1,columns=list(features_cv_1))
x_DL_cv_1=pd.DataFrame(count_train_cv_DL_1,columns=list(features_cv_DL_1))
x_ML_cv_1=pd.DataFrame(count_train_cv_ML_1,columns=list(features_cv_ML_1))
#unigrams and bigrams
count_train_cv_ML_2,features_cv_ML_2 = count_vectorizer_features(accident_safety_upsampled_pos.Description_ML,2)
count_train_cv_DL_2,features_cv_DL_2 = count_vectorizer_features(accident_safety_upsampled_pos.Description_DL,2)
count_train_cv_2,features_cv_2 = count_vectorizer_features(accident_safety_upsampled_pos.Description,2)
x_orig_cv_2=pd.DataFrame(count_train_cv_2,columns=list(features_cv_2))
x_DL_cv_2=pd.DataFrame(count_train_cv_DL_2,columns=list(features_cv_DL_2))
x_ML_cv_2=pd.DataFrame(count_train_cv_ML_2,columns=list(features_cv_ML_2))
#unigrams, bigrams and trigrams
# count_train_cv_ML_3,features_cv_ML_3 = count_vectorizer_features(accident_safety_upsampled_pos.Description_ML,3)
# count_train_cv_DL_3,features_cv_DL_3 = count_vectorizer_features(accident_safety_upsampled_pos.Description_DL,3)
# count_train_cv_3,features_cv_3 = count_vectorizer_features(accident_safety_upsampled_pos.Description,3)
# x_orig_cv_3=pd.DataFrame(count_train_cv_3,columns=list(features_cv_3))
# x_DL_cv_3=pd.DataFrame(count_train_cv_DL_3,columns=list(features_cv_DL_3))
# x_ML_cv_3=pd.DataFrame(count_train_cv_ML_3,columns=list(features_cv_ML_3))
#Let us see the shape of the dataset
#Deep learning dataset
print('Number of unigram features generated in the deep learning dataset:',x_DL_cv_1.shape)
print('Number of bigrams features generated in the deep learning dataset:',x_DL_cv_2.shape)
# print('Number of trigrams features generated in the deep learning dataset:',x_DL_cv_3.shape)
#Machine learning dataset
print('Number of unigram features generated in the machine learning dataset:',x_ML_cv_1.shape)
print('Number of bigrams features generated in the machine learning dataset:',x_ML_cv_2.shape)
# print('Number of trigrams features generated in the machine learning dataset:',x_ML_cv_3.shape)
Since the number of features generated are very large in number, we will suffer from curse of dimensionality, hence reducing the number of features to 750
#unigrams
# Re-vectorize with the feature count capped at 750 to limit dimensionality.
count_train_cv_ML_1,features_cv_ML_1 = count_vectorizer_features(accident_safety_upsampled_pos.Description_ML,1,750)
count_train_cv_DL_1,features_cv_DL_1 = count_vectorizer_features(accident_safety_upsampled_pos.Description_DL,1,750)
count_train_cv_1,features_cv_1 = count_vectorizer_features(accident_safety_upsampled_pos.Description,1,750)
x_orig_cv_1=pd.DataFrame(count_train_cv_1,columns=list(features_cv_1))
x_DL_cv_1=pd.DataFrame(count_train_cv_DL_1,columns=list(features_cv_DL_1))
x_ML_cv_1=pd.DataFrame(count_train_cv_ML_1,columns=list(features_cv_ML_1))
#unigrams and bigrams
count_train_cv_ML_2,features_cv_ML_2 = count_vectorizer_features(accident_safety_upsampled_pos.Description_ML,2,750)
count_train_cv_DL_2,features_cv_DL_2 = count_vectorizer_features(accident_safety_upsampled_pos.Description_DL,2,750)
count_train_cv_2,features_cv_2 = count_vectorizer_features(accident_safety_upsampled_pos.Description,2,750)
x_orig_cv_2=pd.DataFrame(count_train_cv_2,columns=list(features_cv_2))
x_DL_cv_2=pd.DataFrame(count_train_cv_DL_2,columns=list(features_cv_DL_2))
x_ML_cv_2=pd.DataFrame(count_train_cv_ML_2,columns=list(features_cv_ML_2))
# #unigrams,bigrams and trigrams
# count_train_cv_ML_3,features_cv_ML_3 = count_vectorizer_features(accident_safety_upsampled_pos.Description_ML,3,750)
# count_train_cv_DL_3,features_cv_DL_3 = count_vectorizer_features(accident_safety_upsampled_pos.Description_DL,3,750)
# count_train_cv_3,features_cv_3 = count_vectorizer_features(accident_safety_upsampled_pos.Description,3,750)
# x_orig_cv_3=pd.DataFrame(count_train_cv_3,columns=list(features_cv_3))
# x_DL_cv_3=pd.DataFrame(count_train_cv_DL_3,columns=list(features_cv_DL_3))
# x_ML_cv_3=pd.DataFrame(count_train_cv_ML_3,columns=list(features_cv_ML_3))
Deep learning features
# #print unigram features
print(count_train_cv_DL_1.shape)
print(features_cv_DL_1[:50])
# #print bigram features
print(count_train_cv_DL_2.shape)
print(features_cv_DL_2[:50])
# # #print trigram features
# # print(count_train_cv_DL_3.shape)
# # print(features_cv_DL_3[:50])
Machine learning features
#print unigram features
print(count_train_cv_ML_1.shape)
print(features_cv_ML_1[:50])
#print bigram features
print(count_train_cv_ML_2.shape)
print(features_cv_ML_2[:50])
# #print trigram features
# # print(count_train_cv_ML_3.shape)
# # print(features_cv_ML_3[:50])
x_ML_cv_2.head()
#unigrams
count_train_tfidf_ML_1,features_tfidf_ML_1 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_ML,1)
count_train_tfidf_DL_1,features_tfidf_DL_1 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_DL,1)
# BUG FIX: the "orig" variant vectorized Description_DL; it should use the raw
# Description column, consistent with the count-vectorizer cells above.
count_train_tfidf_1,features_tfidf_1 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description,1)
x_orig_tfidf_1=pd.DataFrame(count_train_tfidf_1,columns=list(features_tfidf_1))
x_DL_tfidf_1=pd.DataFrame(count_train_tfidf_DL_1,columns=list(features_tfidf_DL_1))
x_ML_tfidf_1=pd.DataFrame(count_train_tfidf_ML_1,columns=list(features_tfidf_ML_1))
#bigrams
count_train_tfidf_ML_2,features_tfidf_ML_2 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_ML,2)
count_train_tfidf_DL_2,features_tfidf_DL_2 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_DL,2)
# BUG FIX: same Description_DL -> Description correction as above.
count_train_tfidf_2,features_tfidf_2 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description,2)
x_orig_tfidf_2=pd.DataFrame(count_train_tfidf_2,columns=list(features_tfidf_2))
x_DL_tfidf_2=pd.DataFrame(count_train_tfidf_DL_2,columns=list(features_tfidf_DL_2))
x_ML_tfidf_2=pd.DataFrame(count_train_tfidf_ML_2,columns=list(features_tfidf_ML_2))
#trigrams
# count_train_tfidf_ML_3,features_tfidf_ML_3 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_ML,3)
# count_train_tfidf_DL_3,features_tfidf_DL_3 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_DL,3)
# count_train_tfidf_3,features_tfidf_3 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description,3)
# x_orig_tfidf_3=pd.DataFrame(count_train_tfidf_3,columns=list(features_tfidf_3))
# x_DL_tfidf_3=pd.DataFrame(count_train_tfidf_DL_3,columns=list(features_tfidf_DL_3))
# x_ML_tfidf_3=pd.DataFrame(count_train_tfidf_ML_3,columns=list(features_tfidf_ML_3))
#Let us see the shape of the dataset
#Deep learning dataset
print('Number of unigram features generated in the deep learning dataset:',x_DL_tfidf_1.shape)
print('Number of bigrams features generated in the deep learning dataset:',x_DL_tfidf_2.shape)
# print('Number of trigrams features generated in the deep learning dataset:',x_DL_tfidf_3.shape)
#Machine learning dataset
print('Number of unigram features generated in the machine learning dataset:',x_ML_tfidf_1.shape)
print('Number of bigrams features generated in the machine learning dataset:',x_ML_tfidf_2.shape)
# print('Number of trigrams features generated in the machine learning dataset:',x_ML_tfidf_3.shape)
#Reducing the features to 750
#unigrams
count_train_tfidf_ML_1,features_tfidf_ML_1 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_ML,1,750)
count_train_tfidf_DL_1,features_tfidf_DL_1 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_DL,1,750)
# BUG FIX: the "orig" variant vectorized Description_DL; it should use the raw
# Description column, consistent with the count-vectorizer cells above.
count_train_tfidf_1,features_tfidf_1 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description,1,750)
x_orig_tfidf_1=pd.DataFrame(count_train_tfidf_1,columns=list(features_tfidf_1))
x_DL_tfidf_1=pd.DataFrame(count_train_tfidf_DL_1,columns=list(features_tfidf_DL_1))
x_ML_tfidf_1=pd.DataFrame(count_train_tfidf_ML_1,columns=list(features_tfidf_ML_1))
#bigrams
count_train_tfidf_ML_2,features_tfidf_ML_2 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_ML,2,750)
count_train_tfidf_DL_2,features_tfidf_DL_2 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_DL,2,750)
# BUG FIX: same Description_DL -> Description correction as above.
count_train_tfidf_2,features_tfidf_2 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description,2,750)
x_orig_tfidf_2=pd.DataFrame(count_train_tfidf_2,columns=list(features_tfidf_2))
x_DL_tfidf_2=pd.DataFrame(count_train_tfidf_DL_2,columns=list(features_tfidf_DL_2))
x_ML_tfidf_2=pd.DataFrame(count_train_tfidf_ML_2,columns=list(features_tfidf_ML_2))
#trigrams
# count_train_tfidf_ML_3,features_tfidf_ML_3 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_ML,3,750)
# count_train_tfidf_DL_3,features_tfidf_DL_3 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description_DL,3,750)
# count_train_tfidf_3,features_tfidf_3 = tfidf_vectorizer_features(accident_safety_upsampled_pos.Description,3,750)
# x_orig_tfidf_3=pd.DataFrame(count_train_tfidf_3,columns=list(features_tfidf_3))
# x_DL_tfidf_3=pd.DataFrame(count_train_tfidf_DL_3,columns=list(features_tfidf_DL_3))
# x_ML_tfidf_3=pd.DataFrame(count_train_tfidf_ML_3,columns=list(features_tfidf_ML_3))
Deep learning features for TF IDF vectorizer
Machine learning features for TF IDF vectorizer
#print unigram features
print(count_train_tfidf_ML_1.shape)
print(features_tfidf_ML_1[:50])
#print bigram features
print(count_train_tfidf_ML_2.shape)
print(features_tfidf_ML_2[:50])
#print trigram features
# print(count_train_tfidf_ML_3.shape)
# print(features_tfidf_ML_3[:50])
Let us now one hot encode the class variable Accident Level
# Integer-encode the class labels in place.
lb_make = LabelEncoder()
accident_safety_upsampled_pos['Accident_Level'] = lb_make.fit_transform(accident_safety_upsampled_pos['Accident_Level'])
# One-hot targets; the three copies below are identical and feed the
# DL / ML / original pipelines respectively.
y_DL = pd.get_dummies(accident_safety_upsampled_pos['Accident_Level']).values
y_ML = pd.get_dummies(accident_safety_upsampled_pos['Accident_Level']).values
y_orig = pd.get_dummies(accident_safety_upsampled_pos['Accident_Level']).values
# Append the encoded label column to every feature frame
# (reset_index(drop=True) aligns rows positionally with the feature frames).
x_ML_cv_1 = x_ML_cv_1.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
x_ML_cv_2 = x_ML_cv_2.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
# x_ML_cv_3 = x_ML_cv_3.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
x_DL_cv_1 = x_DL_cv_1.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
x_DL_cv_2 = x_DL_cv_2.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
# x_DL_cv_3 = x_DL_cv_3.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
x_DL_tfidf_1 = x_DL_tfidf_1.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
x_DL_tfidf_2 = x_DL_tfidf_2.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
# x_DL_tfidf_3 = x_DL_tfidf_3.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
x_ML_tfidf_1 = x_ML_tfidf_1.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
x_ML_tfidf_2 = x_ML_tfidf_2.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
# x_ML_tfidf_3 = x_ML_tfidf_3.join(accident_safety_upsampled_pos['Accident_Level'].reset_index(drop=True))
Now our data is ready, so we can divide the data into test and train.
Dataset to be used for deep learning
Count Vectorized data
Unigrams : x_DL_cv_1 , y_DL
Bigrams : x_DL_cv_2 , y_DL
Trigrams : x_DL_cv_3 , y_DL
TF IDF Vectorized data
Unigrams : x_DL_tfidf_1 , y_DL
Bigrams : x_DL_tfidf_2 , y_DL
Trigrams : x_DL_tfidf_3 , y_DL
Dataset to be used for machine learning models
Count Vectorized data
Unigrams : x_ML_cv_1 , y_ML
Bigrams : x_ML_cv_2 , y_ML
Trigrams : x_ML_cv_3 , y_ML
TF IDF Vectorized data
Unigrams : x_ML_tfidf_1 , y_ML
Bigrams : x_ML_tfidf_2 , y_ML
Trigrams : x_ML_tfidf_3 , y_ML
Let us now input this data into machine learning models
Step 1
# Features = tf-idf unigrams; target = encoded Accident_Level column.
X=x_ML_tfidf_1.drop(['Accident_Level'],axis=1)
Y=x_ML_tfidf_1.Accident_Level
# NOTE(review): stratify uses the one-hot y_ML while the target is the integer
# Y — rows are in the same order, but stratifying on Y directly would be
# clearer; confirm this is intentional.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.20, random_state = 1, stratify = y_ML)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
# For multiclass problems, only 'newton-cg', 'sag', 'saga' and 'lbfgs' handle multinomial loss; 'liblinear' is limited to one-versus-rest schemes.
resultsDf = pd.DataFrame()
# Building a Logistic Regression model (the original comment mistakenly said "Linear Regression")
lr = LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state = 1)
# Train and Test the model
resultsDf = train_test_model(lr, 'Logistic Regression', X_train, X_test, y_train, y_test, 'none', 1, 'no', 'yes', 'no')
# Store the accuracy results for each model in a dataframe for final comparison
resultsDf
train_test_allmodels(X_train, X_test, y_train, y_test, 'no','yes','no')
Let us now perform the same steps with
Step 1
#unigrams and bigrams
X=x_ML_tfidf_2.drop(['Accident_Level'],axis=1)
Y=x_ML_tfidf_2.Accident_Level
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.20, random_state = 1, stratify = y_ML)
train_test_allmodels(X_train, X_test, y_train, y_test, 'no','yes','no')
X_train.shape,X_test.shape,y_train.shape,y_test.shape
# #unigrams bigrams and trigrams
# X=x_ML_tfidf_3.drop(['Accident_Level'],axis=1)
# Y=x_ML_tfidf_3.Accident_Level
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.20, random_state = 1, stratify = y_ML)
# train_test_allmodels(X_train, X_test, y_train, y_test, 'no')
We can see that SVC gives us the best accuracy and the lowest loss as compared to the other machine learning models.
It is followed by the Catboost classifier which gives us the next best results.
Let us see what results we get using count vectorized data.
# Same modelling pass, now on the count-vectorized features (unigrams).
X=x_ML_cv_1.drop(['Accident_Level'],axis=1)
Y=x_ML_cv_1.Accident_Level
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.20, random_state = 1, stratify = y_ML)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
train_test_allmodels(X_train, X_test, y_train, y_test, 'no','yes','no')
# Count-vectorized unigrams+bigrams.
X=x_ML_cv_2.drop(['Accident_Level'],axis=1)
Y=x_ML_cv_2.Accident_Level
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.20, random_state = 1, stratify = y_ML)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
train_test_allmodels(X_train, X_test, y_train, y_test, 'no','yes','no')
# X=x_ML_cv_3.drop(['Accident_Level'],axis=1)
# Y=x_ML_cv_3.Accident_Level
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.20, random_state = 1, stratify = y_ML)
# print(X_train.shape)
# print(X_test.shape)
# print(y_train.shape)
# print(y_test.shape)
# train_test_allmodels(X_train, X_test, y_train, y_test, 'no')
There is not much of a difference between the results of count vectorized data and tf idf vectorized data. In both cases, SVC performs the best followed by the catboost classifier. One more observation is that the data performs the best using tf idf vectorizer (bi-grams) with a training accuracy of 99% and a test accuracy of 96.1%. The precision, recall and F1 scores are also very good approx 96.1%, making it the best performed model.
We will now work on the deep learning data. Will pass the deep learning data to the below models:
First, we will embed our deep learning data using Glove embeddings
# Tokenise each cleaned description into a list of lower-cased words.
# FIX: the loop body was flush-left in the export (a SyntaxError); indented.
my_corpus = []
for text in accident_safety_data_upsampled['Description_DL']:
    words = [word.lower() for word in word_tokenize(text)]
    my_corpus.append(words)
# NOTE(review): num_words is the number of DOCUMENTS, not the vocabulary size,
# yet it is later passed as Tokenizer(num_words) — confirm this is intended.
num_words = len(my_corpus)
print(num_words)
accident_safety_data_upsampled.head()
# Features are the cleaned descriptions; target is the accident level.
X = accident_safety_data_upsampled['Description_DL']
Y = accident_safety_data_upsampled['Accident_Level']
#Labelling the column Accident_Level
Y = LabelEncoder().fit_transform(Y)
# Stratified 80/20 split keeps the class balance in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.20, random_state = 1, stratify = Y)
print('X_text_train shape : ({0})'.format(X_train.shape[0]))
print('y_text_train shape : ({0},)'.format(y_train.shape[0]))
print('X_text_test shape : ({0})'.format(X_test.shape[0]))
print('y_text_test shape : ({0},)'.format(y_test.shape[0]))
Since we will be passing this data to a deep learning model, we will have to one-hot encode the Y variable.
# One-hot encode both label splits for categorical cross-entropy training.
y_train, y_test = (np_utils.to_categorical(labels) for labels in (y_train, y_test))
Step 1: convert the words into their corresponding numeric indexes.
# Fit the tokenizer on the training text only (avoids test-set leakage into
# the vocabulary), then map both splits to integer index sequences.
tokenizer = Tokenizer(num_words)
tokenizer.fit_on_texts(X_train)
X_train, X_test = (tokenizer.texts_to_sequences(split) for split in (X_train, X_test))
Step 2: Since the sequences returned by the tokenizer vary in length, we will need to pad them to a common length.
# Vocabulary size covers every word the tokenizer has seen, +1 because Keras
# reserves index 0 for padding.
vocab_size = len(tokenizer.word_index) + 1
print("vocab_size:", vocab_size)
# Pad (post) or truncate every sequence to a fixed length so the input
# tensors are rectangular.
max_length = 750
X_train = pad_sequences(X_train, padding='post', maxlen=max_length)
X_test = pad_sequences(X_test, padding='post', maxlen=max_length)
# (A commented-out 50/50 test/validation split used to live here; removed as
# dead code — model.fit's validation_split is used instead.)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
Let us make a weight matrix of all words in corpus using pre-trained glove embeddings
# Build a {word -> 200-d vector} lookup from pre-trained GloVe embeddings,
# then a weight matrix aligned with the tokenizer's word indices.
# (The redundant re-import of numpy and the unused embeddings_dictionary
# were removed; np is already imported at the top of the file.)
embedding = {}
glove_path = "/content/drive/MyDrive/Colab Notebooks/NLP/CapstoneProjectNLP/data/glove.6B.200d.txt"
with open(glove_path, encoding="utf-8") as file:
    for line in file:
        values = line.split()
        word = values[0]
        embedding[word] = np.asarray(values[1:], 'float32')
# No explicit file.close() needed — the `with` block closes the handle.

embedding_size = 200
# Row i holds the GloVe vector for the tokenizer's word i; rows for
# out-of-vocabulary words remain all-zero.
embedding_matrix = np.zeros((vocab_size, embedding_size))
for i, word in tokenizer.index_word.items():
    # BUG FIX: the original guard `i < num_words + 1` compared word indices
    # against the DOCUMENT count, silently leaving many in-vocabulary words
    # with zero vectors. Every index below vocab_size has a matrix row.
    if i < vocab_size:
        vector = embedding.get(word)
        if vector is not None:
            embedding_matrix[i] = vector
print(len(embedding))
print(embedding_matrix.shape)
Simple NN model
# --- Baseline: simple feed-forward network on the padded index sequences ---
epochs = 20
model = Sequential()
model.add(Dense(32, input_shape=(len(X_train[0]),), activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(10, activation='relu'))
# Output width follows the number of one-hot classes in y_train.
model.add(Dense(len(y_train[0]), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit with 20% of the training data held out as a validation split.
hist = model.fit(np.array(X_train), np.array(y_train), validation_split=0.2,
                 epochs=epochs, batch_size=5, verbose=1)
# Evaluate on the full train and test sets.
train_accuracy = model.evaluate(X_train, y_train, batch_size=5, verbose=0)
test_accuracy = model.evaluate(X_test, y_test, batch_size=5, verbose=0)
print(train_accuracy, test_accuracy)
# FIX: use a dedicated x-axis variable instead of clobbering the int `epochs`.
epoch_axis = range(len(hist.history['loss']))
plt.plot(epoch_axis, hist.history['loss'], label='train')
# NOTE(review): the curve labelled 'test' is actually the validation split.
plt.plot(epoch_axis, hist.history['val_loss'], label='test')
plt.legend(loc='upper right')
plt.title('Training and validation loss')
LSTM
# --- LSTM on top of frozen pre-trained GloVe embeddings ---
epochs = 2
model = Sequential()
# trainable=False keeps the GloVe vectors frozen during training.
model.add(Embedding(vocab_size, embedding_size, weights=[embedding_matrix],
                    trainable=False))
model.add(LSTM(32, dropout=0.01, recurrent_dropout=0.01))
model.add(Dense(10, activation='relu'))
# Output width follows the number of one-hot classes (5 accident levels here)
# instead of a hard-coded constant.
model.add(Dense(len(y_train[0]), activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
print(model.summary())
history = model.fit(X_train, y_train, epochs=epochs, batch_size=5,
                    validation_split=0.2)
# Evaluate on the full train and test sets.
train_accuracy = model.evaluate(X_train, y_train, batch_size=5, verbose=0)
test_accuracy = model.evaluate(X_test, y_test, batch_size=5, verbose=0)
print(train_accuracy, test_accuracy)
# FIX: use a dedicated x-axis variable instead of clobbering the int `epochs`.
epoch_axis = range(len(history.history['loss']))
plt.plot(epoch_axis, history.history['loss'], label='train')
# NOTE(review): the curve labelled 'test' is actually the validation split.
plt.plot(epoch_axis, history.history['val_loss'], label='test')
plt.legend(loc='upper right')
plt.title('Training and validation loss')
Bi Directional LSTM
class Metrics(tf.keras.callbacks.Callback):
    """Keras callback recording micro-averaged F1 / recall / precision on a
    fixed evaluation set at the end of every epoch.

    validation_data is expected to be a 3-tuple (x, y, target_type) where
    target_type == 'multi_class' selects argmax decoding; anything else is
    treated as multi-label (rounded sigmoid/softmax outputs).
    """

    def __init__(self, validation_data=()):
        super().__init__()
        self.validation_data = validation_data

    def on_train_begin(self, logs=None):  # FIX: mutable default {} removed
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs=None):
        xVal, yVal, target_type = self.validation_data
        if target_type == 'multi_class':
            # FIX: use self.model — the original referenced the global
            # `model`. predict_classes was removed in TF 2.6; argmax over
            # the predicted probabilities is the supported equivalent.
            val_predict_classes = np.argmax(self.model.predict(xVal, verbose=0), axis=-1)
        else:
            # Multilabel: threshold the raw predictions at 0.5 via round().
            val_predict_classes = (np.asarray(self.model.predict(xVal))).round()
        val_targ = yVal
        _val_f1 = f1_score(val_targ, val_predict_classes, average='micro')
        _val_recall = recall_score(val_targ, val_predict_classes, average='micro')
        _val_precision = precision_score(val_targ, val_predict_classes, average='micro')
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
# --- Build a Bi-directional LSTM with the Keras functional API ---
epochs = 20
deep_inputs = Input(shape=(max_length,))
# Frozen GloVe embedding layer.
embedding_layer = Embedding(vocab_size, embedding_size,
                            weights=[embedding_matrix],
                            trainable=False)(deep_inputs)
# The Bi-LSTM returns the full sequence so global max pooling can pick the
# strongest activation per channel across all time steps.
LSTM_Layer_1 = Bidirectional(LSTM(128, return_sequences=True))(embedding_layer)
max_pool_layer_1 = GlobalMaxPool1D()(LSTM_Layer_1)
# Funnel: 32 -> 10 -> 5 with light dropout between dense layers.
# (The `input_shape=` kwargs on these non-first layers were removed — Keras
# ignores them there; several commented-out alternative stacks were deleted
# as dead code.)
dense_layer_3 = Dense(32, activation='relu')(max_pool_layer_1)
drop_out_layer_4 = Dropout(0.01)(dense_layer_3)
dense_layer_4 = Dense(10, activation='relu')(drop_out_layer_4)
drop_out_layer_5 = Dropout(0.01)(dense_layer_4)
dense_layer_5 = Dense(5, activation='softmax')(drop_out_layer_5)
model = Model(inputs=deep_inputs, outputs=dense_layer_5)
opt = SGD(learning_rate=0.001, momentum=0.9)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['acc'])
print(model.summary())
# Use early stopping on the training loss.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=7, min_delta=1E-3)
# NOTE(review): ReduceLROnPlateau's `factor` MULTIPLIES the learning rate;
# 0.0001 effectively zeroes it on the first trigger — confirm this is intended.
rlrp = ReduceLROnPlateau(monitor='val_loss', factor=0.0001, patience=5, min_delta=1E-4)
target_type = 'multi_label'
# FIX: renamed from `metrics`, which shadowed the sklearn `metrics` module
# imported at the top of the file.
metrics_callback = Metrics(validation_data=(X_train, y_train, target_type))
# BUG FIX: the EarlyStopping callback was created but never passed to fit();
# include it so it actually takes effect.
training_history = model.fit(X_train, y_train, epochs=epochs, batch_size=8,
                             verbose=1, validation_split=0.2,
                             callbacks=[callback, rlrp, metrics_callback])
# --- Evaluate the Bi-LSTM and plot its learning curves ---
train_accuracy = model.evaluate(X_train, y_train, batch_size=5, verbose=0)
test_accuracy = model.evaluate(X_test, y_test, batch_size=5, verbose=0)
print(train_accuracy, test_accuracy)

# FIX: use a dedicated x-axis variable instead of clobbering `epochs`.
epoch_axis = range(len(training_history.history['loss']))
plt.plot(epoch_axis, training_history.history['loss'], label='train')
# NOTE(review): the 'test' curves below are actually the validation split.
plt.plot(epoch_axis, training_history.history['val_loss'], label='test')
plt.legend(loc='upper right')
plt.title('Training and validation loss')

epoch_axis = range(len(training_history.history['val_acc']))
plt.plot(epoch_axis, training_history.history['acc'], label='train')
plt.plot(epoch_axis, training_history.history['val_acc'], label='test')
plt.legend(loc='upper right')
plt.title('Training and validation accuracy')

y_pred = model.predict(X_test)
y_pred
# Persist the best model so the chatbot can load it later.
model.save('bidirectional_lstm_model.h5')
Conclusion:
We can see from the above scores that the bi-directional LSTM model has performed the best out of all machine learning and deep learning models. The accuracy is very high and the loss is also very low. Since the bidirectional LSTM performed the best, we will build the chatbot using this model. Let us first pickle the best machine learning model (SVC) and deep learning model (Bi-Directional LSTM).
SVC with unigrams has given us the best training (approx. 97%) and test (approx. 96%) scores, hence we have picked this model for machine learning. The code for the pickled file is in the file train_ml_model.py.
Let us now build the UI for a chatbot.
# Install the ChatterBot training corpus, then launch the Flask chatbot app
# (app.py was uploaded to the Colab runtime at the top of this notebook).
!pip install chatterbot-corpus
!python app.py